/*	This program creates an analysis dataset that includes periods without reported earnings
for the analysis of sample attrition */

***** Set directories 
* Project folders. Of these, only dir_clean is referenced in the part of the script shown here.
local dir_raw 		"~/Dropbox/Retirement gaming/raw"
local dir_do 		"~/Dropbox/Retirement gaming/dataverse"
local dir_clean 	"~/Dropbox/Retirement gaming/clean"
local dir_output 	"~/Dropbox/Retirement gaming/output/dataverse"

* Name of the dataset written by this script.
* NOTE(review): the value already ends in .dta and the save and merge commands below append .dta again,
* so the file on disk carries a double extension. Confirm what downstream code expects before changing it.
local dataname "extmargin_medbcw.dta"


** BUILD THE PERSON LIST FROM THE MAIN SAMPLE AND ATTACH THE ADMIN RECORDS
* One row per distinct combination of the person id and the listed sample variables
use "`dir_clean'/mainsample_medbcw.dta", clear
keep i ciiu2_1stobs small_1stobs large_1stobs ndep_cat_1stobs sample_self_empl tenure1yrs_at49 firmid_1stobs
duplicates drop
* Add medbcw from the predictions file; rows found in only one of the two files are kept
merge 1:1 i using "`dir_clean'/mainsample_predictions.dta", keepusing(medbcw) nogenerate
* Attach the admin records (many rows per person) and keep only persons present in both files
merge 1:m i using "`dir_clean'/admindata.dta", keep(match) nogenerate


****** Flag industria & comercio records (nothing is dropped on this criterion)
drop aportaci_min
generate sample_aportaci = (aportaci==1)
* anynotic = 1 on every row of a person who has at least one record with aportaci different from 1
capture drop aux
generate aux = (aportaci!=1)
bysort i: egen anynotic = max(aux)


*** Identify and drop people in early retirement regimes
* vf_min 103: people working while already retired
* vf_min 97 or 98: people reported with no service to the firm
drop if inlist(vf_min, 97, 98, 103)

*** AGE VARIABLES, MEASURED FROM THE MONTH OF THE 50TH BIRTHDAY
* month of birth
format Fnac %td
generate birth_month = mofd(Fnac)
* calendar month of the 50th birthday (600 months after the birth month)
generate refbday_month = birth_month + 600
format birth_month refbday_month %tm
* months elapsed since the 50th-birthday month (negative before it)
generate agemonths_centered = t - refbday_month
* age in months, age in years, and age rounded to the nearest whole year
generate agemonths = agemonths_centered + 600
generate age = agemonths/12
generate agedisc = round(age)

* whole years relative to age 50, rounded down
generate age_centered = floor(agemonths_centered/12)

****** RETIREMENT TIMING
generate tegr = mofd(Fegr)
format tegr %tm
* row falls in the exit month and is reported as leaving due to retirement (causal_5==1)
generate retirnow = (t==tegr) & (causal_5==1)
tabulate retirnow
* tretir = earliest such month for each person (missing when there is none)
capture drop aux
bysort i: egen tretir = min(cond(retirnow==1, t, .))
format tretir %tm
* rows strictly after the first retirement month; t>tretir is false when tretir is missing
generate postretir = (t>tretir)
* only the first retirement row keeps retirnow==1
replace retirnow = 0 if postretir==1
bysort i: egen retir_insample = max(retirnow)
tabulate retir_insample postretir

*** people with employment after retirement (flagged, not dropped anymore)
generate postretir_work = postretir==1 & W>0 & W<.

* self-employed rows
generate self_empl = (status_1==1)

* employee rows
generate empl = (status_3==1)


*** EARNINGS ****
summarize W Wben unempl
* Fill Wben where it is missing: first with W, then with the sum of the three benefit amounts
replace Wben = W if Wben==.
replace Wben = amt_unempl + amt_mater + amt_sickness if Wben==.
* Where Wben is exactly zero and remC2 is positive, take remC2
replace Wben = remC2 if Wben==0 & remC2>0 & remC2<.
* From here on W is the filled measure; the raw W is kept as W_noben
rename W W_noben
rename Wben W

* Drop rows with missing or zero earnings; paid-leave periods with positive payments stay
drop if W==. | W==0


* salaried work observations (tipREM_min==1)
generate sample_salary = (tipREM_min==1)

*** earnings outliers: flagged only, NOT DROPPED HERE
* thresholds are the 5th and 95th percentiles of W, stored in the globals Wmin and Wmax
summarize W, detail
global Wmin = r(p5)
global Wmax = r(p95)
generate wlow  = (W < $Wmin)
generate whigh = (W > $Wmax)
* person-level versions: 1 on every row of a person with any flagged row
bysort i: egen anylow  = max(wlow)
bysort i: egen anyhigh = max(whigh)
generate anywout = (anylow==1) | (anyhigh==1)



*** Approx. firm size
bysort self_empl: sum ndep
* Categorical version of ndep, coded 0-5 and labelled with size_cat.
* Rows with missing ndep are left with a missing category.
label define size_cat 0 "Micro less than 5 " 1 "Micro 5-9" 2 "Small 10-19" 3 "Small 20-49" 4 "Medium 50-249" 5 "Large 250 plus"
foreach X in ndep {
	g 		`X'_cat=0 if `X'<5 // micro 1: fewer than 5
	replace `X'_cat=1 if `X'>=5 & `X'<10 // micro 2
	replace `X'_cat=2 if `X'>=10 & `X'<20 // small 1
	replace `X'_cat=3 if `X'>=20 & `X'<50 // small 2
	replace `X'_cat=4 if `X'>=50 & `X'<250 // medium
	* Stata ranks missing above every number, so the open-ended top bracket must exclude it;
	* without the upper bound, rows with missing firm size would be coded as large
	replace `X'_cat=5 if `X'>=250 & `X'<. // large
	label values `X'_cat size_cat
}
tab ndep_cat self_empl

* Calendar year of the observation month t
cap drop year
g year=yofd(dofm(t))


*** Keep only main job and drop duplicates
* Reduces the data to one row per person-month (i t) in three passes:
*   1) among duplicated person-months, drop non-salary rows when a salary row exists
*   2) among the remaining duplicates, drop rows whose W is below the person-month maximum
*   3) drop the remaining rows that repeat the same i, t and W
* tag counts the other rows sharing the same i t (0 = unique)
duplicates tag i t, g(tag)
// prioritize (among duplicates) jobs with salary pay over jobs with daily or commission pay
tab tag sample_salary
bys i t: egen maxsal=max(sample_salary) 
drop if tag>0 & maxsal==1 & sample_salary==0
drop tag maxsal
* recompute the duplicate count and keep only the highest-earning row(s)
duplicates tag i t, g(tag)
bysort i t: egen maxrem=max(W)
drop if tag>0 & W<maxrem
drop tag
drop maxrem
* what is left are exact ties on W; tabulate how many there are
duplicates tag i t, g(tag)
tab tag
sort i t
* NOTE(review): rows tied on i t W may still differ in other variables, and the sort above does not
* order rows within a tie, so which of them survives is presumably arbitrary -- confirm this is acceptable
duplicates drop i t W, force
drop tag  


*JOB CHANGES
* Flags rows where j differs from its value on the person's previous observed row,
* and marks every row at a given rounded age (agedisc) where such a change occurs.
xtset i t
sort i t
cap drop auxj
* auxj holds the previous value of j. It is stored as double because generate defaults to float,
* which cannot represent integers above 16,777,216 exactly; a rounded copy of a large identifier
* would make the comparison with j below report a change that did not happen.
bys i: gen double auxj=l1.j
sort i t
* when the previous calendar month is not in the data, fall back to the previous row of the same person
replace auxj=j[_n-1] if auxj==. & i[_n]==i[_n-1]
* missing when there is no previous value to compare with (for example the first row of a person)
g jobchange=auxj!=j if auxj!=.
* 1 on all rows of a person at a given rounded age if any of those rows is a change
bys i agedisc: egen jobchange_atage=max(jobchange)
replace jobchange_atage=. if jobchange==.


*** Birth cohorts in the sample (already imposed when the main sample was selected)
generate cohort = yofd(Fnac)
keep if birth_month>=tm(1941m4) & birth_month<tm(1971m4)


************************************************************
*** EVENT TIME MEASURED FROM THE *PREDICTED* START OF BCW ***
************************************************************
* reference month: 50th-birthday month shifted by medbcw years, rounded to whole months
generate refbday_month_medbcw = refbday_month + round(12*medbcw) if medbcw!=.
summarize refbday_month_medbcw
format refbday_month_medbcw %tm
* months elapsed since the reference month (negative before it)
generate timemonths_bcw = t - refbday_month_medbcw
* whole years since the reference month for months in [-60, 96), giving the values -5 to 7
generate time_bcw = floor(timemonths_bcw/12) if timemonths_bcw>=-60 & timemonths_bcw<96
* lump all obs more than 5 years before the start of BCW that belong to the main sample into -6
replace time_bcw = -6 if time_bcw==. & timemonths_bcw<-60 & age_centered>=-5


*** Keep observations in the relevant interval around ** Predicted BCW **
* from age 45 until 8 yrs after the start of BCW
keep if agemonths_centered>=-60 & timemonths_bcw<96
summarize agemonths_centered timemonths_bcw
tabulate time_bcw


*** 2-digit CIIU: first two characters of the string form of ciiu, converted back to a number
* NOTE(review): if ciiu is numeric, a code that lost a leading zero would yield the wrong two digits -- confirm
capture drop aux
tostring ciiu, generate(aux)
generate ciiu2 = substr(aux,1,2)
destring ciiu2, replace
drop aux


* self-employment and employee indicators aggregated to the person-month (_mth) and to the person (_any)
bysort i t: egen self_empl_mth = max(self_empl)
bysort i:   egen self_empl_any = max(self_empl)
bysort i t: egen empl_mth = max(empl)
bysort i:   egen empl_any = max(empl)
generate inboth = (self_empl_mth==1) & (empl_mth==1)
bysort i: egen iinboth = max(inboth)
tabulate iinboth // this indicates that a person is simultaneously employed and self-employed at some time in the sample period


* person-level indicators once more under the _any2 suffix; no rows are dropped here
bysort i: egen empl_any2 = max(empl)
bysort i: egen self_empl_any2 = max(self_empl)

tabulate sample_self_empl self_empl_any2
tabulate sample_self_empl self_empl


*** firm size outliers // NOT DROPPED
* anyfsizeout = 1 on every row of a person whose largest reported ndep is 5000 or more
bys i: egen maxsize=max(ndep)
* maxsize is missing when a person has no non-missing ndep; Stata ranks missing above every number,
* so it must be excluded explicitly or such persons would be flagged as outliers
g anyfsizeout = maxsize >= 5000 & maxsize < .
drop maxsize


* Keep relevant variables
keep i t j year age age_centered agemonths_centered time_bcw timemonths_bcw medbcw refbday_month_medbcw *empl* W* remC1_sum remC2_sum remC3_sum amt_* ben ciiu*  Tipocontr ipc ndep ndep_cat *sample* birth_month  cohort Fing Fegr hrsmonth trem_*_max status_*_max *jobchange* any* *_1stobs wlow whigh


*************************************************
*** Additional variables
*************************************************


* running count of each person's rows within self_empl status, in order of t
bysort i self_empl (t): generate order = _n


* lowest and highest age at which each person appears
bysort i: egen minage = min(age)
bysort i: egen maxage = max(age)

* One-digit industry group built from the 2-digit CIIU code
generate ciiu1 = 1 if ciiu2<10													// Agriculture and mining
replace  ciiu1 = 2 if inrange(ciiu2,10,33) | ciiu2==95							// Manufacturing
replace  ciiu1 = 3 if inrange(ciiu2,35,39)										// Energy and waste disposal
replace  ciiu1 = 4 if inrange(ciiu2,41,43)										// Construction
replace  ciiu1 = 5 if inrange(ciiu2,45,47) | inrange(ciiu2,55,56)				// wholesale and retail, restaurants, hotels
replace  ciiu1 = 6 if inrange(ciiu2,49,53) | ciiu2==61							// transport and communications
replace  ciiu1 = 7 if inrange(ciiu2,62,82) | ciiu2==96							// services
replace  ciiu1 = 8 if inrange(ciiu2,84,94) | inrange(ciiu2,58,60) | ciiu2==97	// public admin, social and domestic services

* Broad sector indicators; left missing when the 2-digit code is missing
generate manufacturing     = (ciiu1==2) if ciiu2!=.
generate retailhospitality = (ciiu1==5) if ciiu2!=.
generate transportenergy   = inlist(ciiu1,3,6) if ciiu2!=.
generate services          = inlist(ciiu1,4,7,8) if ciiu2!=.

label var manufacturing 	"Manufacturing"
label var retailhospitality "Retail, Restaurants, Hotels"
label var transportenergy 	"Transport, Communications, Energy"
label var services 			"Services, Other"

* Single categorical sector (1-4); missing when none of the four indicators equals 1
generate sector = cond(services==1, 4, cond(transportenergy==1, 3, cond(retailhospitality==1, 2, cond(manufacturing==1, 1, .))))

* Tenure in months, counted from the month of Fing
generate jobstart = mofd(Fing)
format jobstart %tm
generate tenure = t - jobstart
summarize tenure if self_empl==1, detail
summarize tenure if empl==1, detail
* at least 12 months of tenure; missing when tenure is missing
generate tenure_1yrs = (tenure>=12) if tenure!=.


** FIRM SIZE CATEGORIES **
* shift every category up by one so that code 0 can be used for ndep equal to zero
replace ndep_cat = ndep_cat + 1
replace ndep_cat = 0 if ndep==0
capture label drop ndep_cat
label define ndep_cat 0 "no employees" 1 "1-4" 2 "5-9" 3 "10-19" 4 "20-49" 5 "50-249" 6"250 plus" 
label values ndep_cat ndep_cat


* 1 when any of trem_2_max to trem_5_max equals 1
generate othpay = inlist(1, trem_2_max, trem_3_max, trem_4_max, trem_5_max)

* set monthly hours below their 5th percentile to missing
summarize hrsmonth, detail
replace hrsmonth = . if hrsmonth<r(p5)

* earnings per hour
generate wagephr = W/hrsmonth if hrsmonth>0

save "`dir_clean'/`dataname'.dta", replace

******
* RESTRICT THE SAVED FILE TO PERSONS SELECTED FROM THE MAIN SAMPLE
* selected: self-employed rows, or employee rows with small_1stobs or large_1stobs equal to 1
use "`dir_clean'/mainsample_medbcw.dta"
keep if self_empl==1 | (empl==1 & (small_1stobs==1 | large_1stobs==1))
keep i
duplicates drop
* keep only persons present both in this list and in the file saved above, then overwrite that file
merge 1:m i using "`dir_clean'/`dataname'.dta", keep(match) nogenerate
save "`dir_clean'/`dataname'.dta", replace
***
***


********************************************************************************
**** PREPARE DATA FOR EVENT STUDIES 							
********************************************************************************

* Event-time indicators built from time_bcw (years relative to the start of BCW)
generate post03 = time_bcw>=0 & time_bcw<4		// 0-3 post
generate post4  = time_bcw>=4					// 4+ post
generate pre2   = time_bcw<=-2					// 2+ pre
generate pre25  = inrange(time_bcw,-5,-2)		// 2-5 pre
generate pre6   = time_bcw<=-6					// 6+ pre
generate post0  = time_bcw>=0					// any post
generate pre24  = time_bcw>-5 & time_bcw<=-2	// 2-4 pre (to drop two dummies)

label var post03 "0-3 yrs. post start BCW"
label var post4 "4+ yrs. post start BCW"
label var pre2 "2+ yrs. pre start BCW"
label var pre25 "2-5 yrs. pre start BCW"
label var pre6 "6+ yrs. pre start BCW"	
label var post0 "Post start BCW"
label var pre24 "2-4 yrs. pre start BCW"

* Linear event-time trend in years, and its interactions with three of the indicators
generate bcwttrend = timemonths_bcw/12
foreach d in post0 post4 pre6 {
	generate bcwttrend_`d' = bcwttrend*`d'
}
label var bcwttrend "Event-time trend"
label var bcwttrend_post0 "Post BCW x event-time trend"
label var bcwttrend_post4 "4+ yrs. post BCW x event-time trend"
label var bcwttrend_pre6 "6+ yrs. pre BCW x event-time trend"


* INTERACTIONS FOR DID
* event-time terms that are interacted with each of the three group indicators below
local evvars bcwttrend post03 post4 pre2 pre25 pre24 pre6 post0 bcwttrend_post0 bcwttrend_pre6

* SELF-EMPLOYED
foreach v of local evvars {
	generate `v'_self_empl = `v'*self_empl
}
label var post03_self_empl 			"Self-employed x 0-3 post BCW"
label var post4_self_empl 			"Self-employed x 4+ post BCW"
label var pre2_self_empl 			"Self-employed x 2+ pre BCW"
label var pre25_self_empl 			"Self-employed x 2-5 pre BCW"
label var pre24_self_empl 			"Self-employed x 2-4 pre BCW"
label var pre6_self_empl 			"Self-employed x 6+ pre BCW"
label var post0_self_empl			"Self-employed x post BCW"
label var bcwttrend_self_empl	 	"Self-employed x event-time trend"
label var bcwttrend_post0_self_empl "Self-employed x post BCW x trend"
label var bcwttrend_pre6_self_empl 	"Self-employed x 6+ pre BCW x trend"


* LARGE SIZE FIRM (FOR EMPLOYEES)
foreach v of local evvars {
	generate `v'_larger = `v'*large_1stobs
}
label var post03_larger 			"Large firm x 0-3 post BCW"
label var post4_larger 				"Large firm x 4+ post BCW"
label var pre2_larger 				"Large firm x 2+ pre BCW"
label var pre25_larger 				"Large firm x 2-5 pre BCW"
label var pre24_larger 				"Large firm x 2-4 pre BCW"
label var pre6_larger 				"Large firm x 6+ pre BCW"
label var post0_larger				"Large firm x post BCW"
label var bcwttrend_larger	 		"Large firm x event-time trend"
label var bcwttrend_post0_larger 	"Large firm x post BCW x trend"
label var bcwttrend_pre6_larger 	"Large firm x 6+ pre BCW x trend"

* SMALL SIZE FIRM (FOR EMPLOYEES)
foreach v of local evvars {
	generate `v'_small = `v'*small_1stobs
}
label var post03_small 				"Small firm x 0-3 post BCW"
label var post4_small 				"Small firm x 4+ post BCW"
label var pre2_small 				"Small firm x 2+ pre BCW"
label var pre25_small 				"Small firm x 2-5 pre BCW"
label var pre24_small 				"Small firm x 2-4 pre BCW"
label var pre6_small 				"Small firm x 6+ pre BCW"
label var post0_small				"Small firm x post BCW"
label var bcwttrend_small	 		"Small firm x event-time trend"
label var bcwttrend_post0_small 	"Small firm x post BCW x trend"
label var bcwttrend_pre6_small 		"Small firm x 6+ pre BCW x trend"

* write the final dataset, then clear memory and leave the do-file
save "`dir_clean'/`dataname'.dta", replace


clear all
exit


